Analyzing Elisa’s morphology data in preparation for my talk at Evolution 2017.
There are 3 good methods for creating and saving graphs:
Run pdf(file=“Rplots.pdf”, width=7in, height=7in, …) before the graphics code and then dev.off afterwards. You can specify the name of the file to save and parameters such as size (check help for other optional arguments). This method will create and save the graphic but will NOT print it to the console or screen, you have to open the saved file to see the graphic.
Run dev.print(device=pdf, file=“Rplots.pdf”, width=7in, height=7in, …) AFTER the graphics code. This function copies the current graphic being displayed (that you just ran) and prints it to the device you specified (pdf in this example). This method first creates the graphic and prints it to the Viewer/Window and then creates a copy saved as the file you specified. Thus, you can see how it looks before you decide to save it.
If you are using ggplot2, you can create and print the graphic and then use ggsave(file=filename, …). Other arguments are available but not required, will use defaults.
Note about pdf(): The pdf function will make assumptions about the font and colors which may not match what you would see if you printed to the Viewer/Window. You are able to specify these arguments.
However, note that some things like fonts and color can change between the screen printed version and the saved version as pdf does not know what options R was using. You are able to specify these arguments.
Loading original dataset and doing cleanup.
load("morphdata_qc.RData") # cray_morph
ls()
head(data)
table(data$group, data$field.id)
table(data$pop, data$field.id)
# excluding the Rusty? individual which was found on a supposedly allopatric Sanborn R.
data <- droplevels(subset(data, field.id!="Rusty?"))
table(data$group, data$field.id)
# create data subsets
sanborn <- droplevels(subset(data, group=="sanborn"))
rusty <- droplevels(subset(data, group=="rusty"))
huron <- droplevels(subset(data, group=="huron"))
kokosing <- droplevels(subset(data, group=="kokosing"))
native <- rbind(rusty, sanborn)
| pop | carapaceL | carapaceWL | areolaLcaraL | areolaWL | rostrumWcaraL | acumenLcaraL | rostrumWacumenL |
|---|---|---|---|---|---|---|---|
| CRM | 30.68000 | 0.5102792 | 0.3409399 | 0.1902244 | 0.0936602 | 0.0918004 | 1.0139108 |
| SCL | 25.89429 | 0.4799137 | 0.3361236 | 0.2069425 | 0.0994951 | 0.1087142 | 0.9371758 |
| VSG | 32.52385 | 0.5105512 | 0.3481707 | 0.1783063 | 0.0879885 | 0.0809147 | 1.1040058 |
| pop | carapaceL | carapaceWL | areolaLcaraL | areolaWL | rostrumWcaraL | acumenLcaraL | rostrumWacumenL |
|---|---|---|---|---|---|---|---|
| BOK | 37.15571 | 0.5129810 | 0.3535049 | 0.1535454 | 0.0909748 | 0.0824252 | 1.120008 |
| STW | 40.86583 | 0.5293463 | 0.3537504 | 0.1537282 | 0.0914164 | 0.0850582 | 1.082325 |
| pop | carapaceL | carapaceWL | areolaLcaraL | areolaWL | rostrumWcaraL | acumenLcaraL | rostrumWacumenL |
|---|---|---|---|---|---|---|---|
| KDN | 28.45286 | 0.4916803 | 0.3434465 | 0.1789597 | 0.0952211 | 0.0885287 | 1.083970 |
| KGL | 28.42182 | 0.5015607 | 0.3434980 | 0.1743737 | 0.1010221 | 0.0936222 | 1.112321 |
| KOA | 32.42714 | 0.4984030 | 0.3424494 | 0.1844100 | 0.1005991 | 0.0872051 | 1.172140 |
| KOB | 27.17667 | 0.4885441 | 0.3212899 | 0.2585956 | 0.1065684 | 0.0870347 | 1.315767 |
| KOC | 30.35143 | 0.5095667 | 0.3443291 | 0.1788985 | 0.0967884 | 0.0873160 | 1.123038 |
| KOK | 35.37643 | 0.5274117 | 0.3620968 | 0.1592829 | 0.0924775 | 0.0756059 | 1.353648 |
| pop | carapaceL | carapaceWL | areolaLcaraL | areolaWL | rostrumWcaraL | acumenLcaraL | rostrumWacumenL |
|---|---|---|---|---|---|---|---|
| HHT | 31.96714 | 0.4807552 | 0.3349861 | 0.1618156 | 0.0938101 | 0.0956487 | 0.9943562 |
| HRM | 36.80846 | 0.5198819 | 0.3578914 | 0.1580438 | 0.0893753 | 0.0742407 | 1.2349134 |
| HUX | 33.93786 | 0.4982206 | 0.3365541 | 0.1695481 | 0.0949176 | 0.0908915 | 1.0532092 |
| LLB | 31.13857 | 0.5162297 | 0.3503549 | 0.1575172 | 0.0916738 | 0.0863232 | 1.0920800 |
| TMR | 33.01857 | 0.4896140 | 0.3365277 | 0.1716353 | 0.0948596 | 0.0917589 | 1.0414049 |
Multipanel graphs can be built in base R with a separate line of code for each desired plot and using graphical parameters to define placement of plots.
Alternatively, such graphs can be built using lattice or ggplot2.
The method using graphical parameters is described here and lattice is generally used.
Using par() sets the graphical parameters, such as how to layout the plots in the viewing window. mfrow=c(rows,cols) tells R how many rows and columns of graphs you want to print at once. R will create each graph individually and then add them to the plot.
Use dev.off() when you are finished to reset the graphical parameters (so R will not try to use the same settings for the next graph that you want to make).
names(data)
par(mfrow=c(2,2))
with(data, boxplot(carapaceL ~ group, xlab="label me", ylab="labelme"))
with(data, boxplot(carapaceW ~ group))
with(data, boxplot(areolaL ~ group))
with(data, boxplot(areolaW ~ group))
dev.off()
histogram(~ carapaceL, data)
histogram(~ log(carapaceL), data)
histogram(~ log10(carapaceL), data)
with(data, boxplot(areolaL ~ pop, las=2))
with(data, xyplot(carapaceW ~ carapaceL, groups=group, pch=c(15, 15, 17, 19), col=c("purple", "violet", "red", "green"), cex=1.1))
levels(data$group)
with(data, xyplot(chelaL ~ carapaceL, groups=group, pch=c(15, 15, 17, 19), col=c("purple", "violet", "red", "green"), cex=1.1))
with(data, xyplot(dactylL ~ chelaL, groups=group, pch=c(15, 15, 17, 19), col=c("purple", "violet", "red", "green"), cex=1.1))
with(data, dotplot(dactylLchL ~ group, cex=1.1))
with(huron, dotplot(dactylLchL ~ pop, cex=1.1))
with(kokosing, dotplot(dactylLchL ~ pop, cex=1.1))
levels(native$group)
## [1] "rusty" "sanborn"
plot(chelaL ~ carapaceL, data=native, col=c("red","blue")[native$group], pch=c(17,19)[native$group])
abline(lm(chelaL ~ carapaceL, data=rusty), col="red")
abline(lm(chelaL ~ carapaceL, data=sanborn), col="blue")
y <- native$chelaL
x <- native$carapaceL
rus <- lm(y ~ x, data=native, subset=(group=="rusty"))
san <- lm(y ~ x, data=native, subset=(group=="sanborn"))
par(mfrow=c(1,2))
dataEllipse(x, y, groups=native$group, levels=c(0.95), col=c("red", "blue"))
abline(rus, col="red")
abline(san, col="blue")
range(x, na.rm=T)
## [1] 18.82 52.05
range(y, na.rm=T)
## [1] 10.72 57.97
with(native, boxplot(chelaLcaraL ~ group, las=1, ylab="Ratio of chela length to carapace length"))
par(mfrow=c(1,1))
# be sure to reset the par before trying to create another plot
densityplot(~ carapaceL, data=native, groups=group, auto.key=T)
densityplot(~ carapaceW, data=native, groups=group, auto.key=T)
densityplot(~ carapaceWL, data=native, groups=group, auto.key=T)
densityplot(~ chelaL/carapaceL, data=native, groups=group, auto.key=T)
densityplot(~ areolaW/carapaceL, data=native, groups=group, auto.key=T)
splom(~ data[c("carapaceL", "carapaceW", "areolaL", "areolaW")] | group, data=data)
splom(~ data[c("carapaceL", "carapaceW", "areolaL", "areolaW")], data=data, groups=group, auto.key=list(space="right"))
splom(~ data[c("carapaceL", "carapaceW", "areolaL", "areolaW")], data=data, groups=pop, auto.key=list(space="right"))
head(data)
with(data, boxplot(carapaceL ~ Korder, names=c("Sanborn", "KOB", "KOA", "KOC", "KOK", "KDN", "KGL", "Rusty")))
with(data, xyplot(dactylLchL ~ Korder))
with(data, boxplot(dactylLchL ~ Horder))
with(data, bwplot(dactylLchL ~ Horder, horizontal=F))
with(data, xyplot(dactylLchL ~ Horder))
with(data, bwplot(dactylLchL ~ pop | group, horizontal=F, scales=list(relation="free", rot=90)))
with(data, boxplot(carapaceL ~ pop, las=2))
ldata <- data[,c(1:18,20:22,25:37)]
names(ldata)
ldata <- gather(ldata, "trait", "value", c(9:34), na.rm=T, factor_key=T)
str(ldata)
levels(ldata$trait)
levels(ldata$trait)[1:3]
with(droplevels(subset(ldata, trait%in%c("carapaceL", "carapaceW"))), bwplot(value ~ group | trait, horizontal=F, scales=list(relation="free", rot=0)))
with(data, bwplot(carapaceL + carapaceW ~ group, allow.multiple=T, horizontal=F, outer=T, scales=list(relation="free"), ylab=""))
What do I want to do here?
carapaceWL.aov <- aov(carapaceWL ~ group/pop, data=native) # runs the anova
anova(carapaceWL.aov) # summary of the analysis
TukeyHSD(carapaceWL.aov, "group")
koko <- droplevels(subset(data, group%in%c("sanborn", "kokosing", "rusty")))
kokoRus <- droplevels(subset(koko, morpho_species=="Rusty" & group!="sanborn"))
kokoRus.mod <- aov(carapaceWL ~ group, data=kokoRus)
summary(kokoRus.mod)
TukeyHSD(kokoRus.mod)
kokoSan <- droplevels(subset(koko, morpho_species=="Sanborn" & group!="rusty"))
kokoSan.mod <- aov(carapaceWL ~ group, data=kokoSan)
summary(kokoSan.mod)
TukeyHSD(kokoSan.mod)
# names(native)
y.raw <- as.matrix(native[,9:18], dimnames=list(NULL, colnames(native)[11:20]))
manova(y.raw ~ group/pop, data=native)
## Call:
## manova(y.raw ~ group/pop, data = native)
##
## Terms:
## group group:pop Residuals
## resp 1 1187.065 369.833 1279.502
## resp 2 185.827 53.764 192.915
## resp 3 0.476 0.722 6.545
## resp 4 423.073 155.378 401.717
## resp 5 3.898 1.211 12.490
## resp 6 8.985 1.277 12.459
## resp 7 858.610 309.058 1314.523
## resp 8 1838.856 649.440 3587.033
## resp 9 319.549 120.797 547.137
## resp 10 145.651 65.265 575.169
## Deg. of Freedom 1 3 59
##
## Residual standard errors: 4.656874 1.808245 0.3330541 2.609362 0.460094 0.4595363 4.720175 7.797254 3.045244 3.12228
## 5 out of 10 effects not estimable
## Estimated effects may be unbalanced
## 3 observations deleted due to missingness
fit <- manova(y.raw ~ group/pop, data=native)
summary(fit)
## Df Pillai approx F num Df den Df Pr(>F)
## group 1 0.75462 15.3768 10 50 4.513e-12 ***
## group:pop 3 1.01014 2.6398 30 156 5.651e-05 ***
## Residuals 59
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# names(data)
pc.mod <- prcomp(na.omit(data[,9:18]))
summary(pc.mod)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 12.2743 2.79074 1.3727 1.0037 0.63925 0.53492
## Proportion of Variance 0.9272 0.04793 0.0116 0.0062 0.00251 0.00176
## Cumulative Proportion 0.9272 0.97509 0.9867 0.9929 0.99540 0.99716
## PC7 PC8 PC9 PC10
## Standard deviation 0.46457 0.33057 0.29133 0.22836
## Proportion of Variance 0.00133 0.00067 0.00052 0.00032
## Cumulative Proportion 0.99848 0.99916 0.99968 1.00000
biplot(pc.mod)
Let’s do a PCA for basic body size measurements using carapaceL, carapaceW, chelaL, palmW
names(data)
## [1] "group" "id" "pop"
## [4] "ind" "key.species" "field.id"
## [7] "sex" "form" "carapaceL"
## [10] "areolaL" "areolaW" "carapaceW"
## [13] "acumenL" "rostrumW" "dactylL"
## [16] "chelaL" "palmW" "palmL"
## [19] "carpus.spines" "gonopodL" "mesial.projL"
## [22] "central.projL" "annulus.ventralis" "notes"
## [25] "carapaceWL" "areolaLcaraL" "areolaWL"
## [28] "rostrumWcaraL" "acumenLcaraL" "rostrumWacumenL"
## [31] "dactylLchL" "dactylLpalmL" "palmWchL"
## [34] "palmLchL" "palmWL" "chelaLcaraL"
## [37] "centralLgonoL" "centLmesLgonoL"
body1.pca <- prcomp(na.omit(data[,c("carapaceL", "carapaceW", "chelaL", "palmW")]))
summary(body1.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 10.4932 2.53632 0.97915 0.53387
## Proportion of Variance 0.9348 0.05462 0.00814 0.00242
## Cumulative Proportion 0.9348 0.98944 0.99758 1.00000
biplot(body1.pca)
ratio1.pca <- prcomp(na.omit(data[,c(25:35)]))
summary(ratio1.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 0.2818 0.2336 0.12246 0.06769 0.03615 0.02049
## Proportion of Variance 0.5098 0.3503 0.09627 0.02941 0.00839 0.00270
## Cumulative Proportion 0.5098 0.8601 0.95637 0.98578 0.99417 0.99687
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 0.01550 0.01083 0.009583 0.004619 0.004176
## Proportion of Variance 0.00154 0.00075 0.000590 0.000140 0.000110
## Cumulative Proportion 0.99841 0.99916 0.999750 0.999890 1.000000
biplot(ratio1.pca)
# names(native)
mydata <- na.omit(native[,c(1:4,9:18)])
# names(mydata)
prcomp.native <- prcomp(mydata[,5:14])
summary(prcomp.native)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 14.8412 3.0552 1.13177 0.74371 0.53836 0.41086
## Proportion of Variance 0.9486 0.0402 0.00552 0.00238 0.00125 0.00073
## Cumulative Proportion 0.9486 0.9888 0.99432 0.99670 0.99795 0.99868
## PC7 PC8 PC9 PC10
## Standard deviation 0.37133 0.28799 0.23886 0.17175
## Proportion of Variance 0.00059 0.00036 0.00025 0.00013
## Cumulative Proportion 0.99927 0.99963 0.99987 1.00000
str(prcomp.native)
## List of 5
## $ sdev : num [1:10] 14.841 3.055 1.132 0.744 0.538 ...
## $ rotation: num [1:10, 1:10] 0.429 0.1669 0.013 0.2497 0.0223 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "carapaceL" "areolaL" "areolaW" "carapaceW" ...
## .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:10] 33.14 11.48 1.96 16.87 2.92 ...
## ..- attr(*, "names")= chr [1:10] "carapaceL" "areolaL" "areolaW" "carapaceW" ...
## $ scale : logi FALSE
## $ x : num [1:64, 1:10] -3.05 10.08 -6.03 -3.08 3.87 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:64] "1" "2" "3" "4" ...
## .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
plot(prcomp.native$x)
For intro/background, see Strauss’ Chapter 4. Discriminating Groups of Organisms in the book Morphometrics for Nonmorphometricians by Elewa (which I own).
Quick-R has a page for Discriminant Function Analysis
MASS package for lineage and quadratic DFA. “Unless prior probabilities are specified, each assumes proportional prior probabilities (i.e., prior probabilities are based on samples sizes).”
lower case letters = numeric variables upper case letters = categorical factors
Following code does an LDA using listwise deletion of missing data. CV=T give jacknifed predictions.
Remove NAs from the dataset:
# names(native)
train <- na.omit(native[,c(1:4,9:18)])
Run the LDA, with jacknife to assess accuracy of the model.
fit.cv <- lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL, data=train, na.action="na.omit", CV=T)
# percent correct for each category of categorical variable
ct <- table(train$group, fit.cv$class)
diag(prop.table(ct,1))
## rusty sanborn
## 0.8800000 0.8717949
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.875
Run the regular LDA without a jacknife (this one can be plotted).
fit <- lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL, data=train, na.action="na.omit")
fit
## Call:
## lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL +
## rostrumW + dactylL + chelaL + palmW + palmL, data = train,
## na.action = "na.omit")
##
## Prior probabilities of groups:
## rusty sanborn
## 0.390625 0.609375
##
## Group means:
## carapaceL carapaceW areolaL areolaW acumenL rostrumW dactylL
## rusty 38.52160 20.08520 13.61280 2.071600 3.224800 3.507200 20.33400
## sanborn 29.69436 14.81538 10.12026 1.894872 2.718974 2.739231 12.82667
## chelaL palmW palmL
## rusty 33.29040 13.627600 12.552800
## sanborn 22.30385 9.047692 9.460769
##
## Coefficients of linear discriminants:
## LD1
## carapaceL 0.03899889
## carapaceW 0.07964590
## areolaL -0.32913984
## areolaW 1.33778228
## acumenL -0.29734309
## rostrumW -1.85492327
## dactylL -0.40488119
## chelaL 0.32838823
## palmW -0.86592076
## palmL 0.76377563
Visualize the results. Using CV=T returns results as a list and this prevents plotting. Run the analysis without CV=T to get a “lda” class object returned, which can then be manipulated/plotted as below.
There is only 1 LD axis so no scatterplot here.
plot(fit)
str(fit)
## List of 10
## $ prior : Named num [1:2] 0.391 0.609
## ..- attr(*, "names")= chr [1:2] "rusty" "sanborn"
## $ counts : Named int [1:2] 25 39
## ..- attr(*, "names")= chr [1:2] "rusty" "sanborn"
## $ means : num [1:2, 1:10] 38.5 29.7 20.1 14.8 13.6 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "rusty" "sanborn"
## .. ..$ : chr [1:10] "carapaceL" "carapaceW" "areolaL" "areolaW" ...
## $ scaling: num [1:10, 1] 0.039 0.0796 -0.3291 1.3378 -0.2973 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "carapaceL" "carapaceW" "areolaL" "areolaW" ...
## .. ..$ : chr "LD1"
## $ lev : chr [1:2] "rusty" "sanborn"
## $ svd : num 10.8
## $ N : int 64
## $ call : language lda(formula = group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL, data = train, ...
## $ terms :Classes 'terms', 'formula' language group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL
## .. ..- attr(*, "variables")= language list(group, carapaceL, carapaceW, areolaL, areolaW, acumenL, rostrumW, dactylL, chelaL, palmW, palmL)
## .. ..- attr(*, "factors")= int [1:11, 1:10] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:11] "group" "carapaceL" "carapaceW" "areolaL" ...
## .. .. .. ..$ : chr [1:10] "carapaceL" "carapaceW" "areolaL" "areolaW" ...
## .. ..- attr(*, "term.labels")= chr [1:10] "carapaceL" "carapaceW" "areolaL" "areolaW" ...
## .. ..- attr(*, "order")= int [1:10] 1 1 1 1 1 1 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(group, carapaceL, carapaceW, areolaL, areolaW, acumenL, rostrumW, dactylL, chelaL, palmW, palmL)
## .. ..- attr(*, "dataClasses")= Named chr [1:11] "factor" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:11] "group" "carapaceL" "carapaceW" "areolaL" ...
## $ xlevels: Named list()
## - attr(*, "class")= chr "lda"
# to see what happens if you leave out variables:
update(fit, .~. -acumenL -rostrumW)
## Call:
## lda(group ~ carapaceL + carapaceW + areolaL + areolaW + dactylL +
## chelaL + palmW + palmL, data = train, na.action = "na.omit")
##
## Prior probabilities of groups:
## rusty sanborn
## 0.390625 0.609375
##
## Group means:
## carapaceL carapaceW areolaL areolaW dactylL chelaL palmW
## rusty 38.52160 20.08520 13.61280 2.071600 20.33400 33.29040 13.627600
## sanborn 29.69436 14.81538 10.12026 1.894872 12.82667 22.30385 9.047692
## palmL
## rusty 12.552800
## sanborn 9.460769
##
## Coefficients of linear discriminants:
## LD1
## carapaceL -0.165928047
## carapaceW -0.005888763
## areolaL -0.103838748
## areolaW 1.406282101
## dactylL -0.443716579
## chelaL 0.289152220
## palmW -0.586321401
## palmL 0.617553590
# library(klaR)
# # this produces a VERY large figure as it's doing pairwise scatterplots
# partimat(group ~ carapaceL + carapaceW + areolaL + areolaW, data=train, method="lda", nplots.vert=3, nplots.hor=3)
This also produces a single axis. Results are precisely the same in terms of accuracy of the prediction; not printed here.
Seems like you can then use the fitted model to predict the group identity of new samples…
In many cases, people log transform these types of variables because it makes bivariate relationships more linear and because “distances increase by multiplicative growth” (Cadrin 2010 in Morphometrics for Nonmorphometricians, ed. Elewa). NOTE: People seem to be using log base 10 here, not natural log (which is the default of the log() function in R). So it’s necessary to specify base 10.
Not clear to me that it made much difference in terms of linearity but let’s see what the analysis looks like.
Using logged values was actually less good at separating the two taxa; results not printed.
Ok, so I’m not sure I need to use the log transformation, it does not seem that different and if anything is a worse fit.
library(mvoutlier)
library(mvnormtest)
outliers <- aq.plot(train[5:14])$outliers
## Projection to the first and second robust principal components.
## Proportion of total variation (explained variance): 0.8304591
with(train, train[which(outliers==T),])
## group id pop ind carapaceL areolaL areolaW carapaceW acumenL
## 5 rusty BOK:5 BOK 5 36.58 13.31 1.93 18.76 2.31
## 8 rusty BOK:8 BOK 8 34.11 11.94 1.47 17.08 2.61
## 13 rusty BOK:13 BOK 13 40.11 14.67 1.61 20.58 3.22
## 14 rusty BOK:14 BOK 14 43.23 15.56 2.32 21.78 3.81
## 190 rusty STW:6 STW 6 36.94 13.19 2.97 19.22 2.81
## 191 rusty STW:7 STW 7 46.15 16.53 2.42 25.29 3.45
## 195 rusty STW:12 STW 12 52.05 17.82 2.62 26.53 5.24
## 196 rusty STW:13 STW 13 38.84 14.25 1.87 21.17 3.20
## 197 rusty STW:14 STW 14 45.42 16.59 2.31 24.37 3.99
## 24 sanborn CRM:10 CRM 10 30.09 9.80 1.81 14.67 3.31
## 26 sanborn CRM:12 CRM 12 33.85 11.23 2.27 17.60 3.23
## 27 sanborn CRM:13 CRM 13 33.96 11.99 2.01 17.13 3.09
## 174 sanborn SCL:3 SCL 3 30.74 10.39 2.14 15.32 3.68
## 177 sanborn SCL:6 SCL 6 22.12 7.53 1.52 10.30 2.05
## 181 sanborn SCL:10 SCL 10 19.88 5.86 1.15 9.10 3.01
## 213 sanborn VSG:2 VSG 2 28.93 9.58 1.85 14.63 2.10
## 214 sanborn VSG:3 VSG 3 32.49 11.72 1.93 17.66 2.02
## 218 sanborn VSG:7 VSG 7 33.45 11.85 1.84 15.27 2.35
## 223 sanborn VSG:12 VSG 12 39.10 13.42 2.13 19.41 3.08
## 224 sanborn VSG:13 VSG 13 33.80 11.58 2.04 16.27 2.91
## 225 sanborn VSG:14 VSG 14 29.19 11.27 2.05 16.02 2.70
## rostrumW dactylL chelaL palmW palmL
## 5 3.12 16.64 27.58 12.63 11.31
## 8 3.07 18.80 30.49 13.12 11.71
## 13 3.33 22.20 42.52 17.86 16.70
## 14 3.96 27.98 44.07 18.55 17.66
## 190 3.36 17.13 26.51 10.64 8.97
## 191 3.97 28.19 43.94 17.28 16.36
## 195 5.51 36.06 57.97 20.57 21.45
## 196 3.39 26.53 42.51 17.02 15.68
## 197 3.94 30.14 48.09 20.30 18.12
## 24 2.91 15.40 26.93 11.57 12.61
## 26 3.29 19.27 33.28 13.20 14.12
## 27 3.17 19.63 33.22 13.21 14.60
## 174 3.08 10.46 18.34 7.54 9.18
## 177 1.99 5.86 10.72 4.17 4.57
## 181 2.07 6.15 11.24 4.71 5.65
## 213 2.63 10.19 16.75 7.93 7.91
## 214 2.72 14.07 23.52 10.69 9.50
## 218 2.88 18.64 30.47 11.12 10.75
## 223 3.17 23.90 41.20 16.99 17.47
## 224 3.06 16.23 24.76 8.17 7.89
## 225 2.62 12.75 22.07 9.51 9.24
# names(train)
Raw measures look better than ratios, more variance explained, fewer outliers.
## [1] "carapaceL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.96395, p-value = 0.05847
##
## [1] "areolaL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.97631, p-value = 0.2549
##
## [1] "areolaW"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.98987, p-value = 0.8807
##
## [1] "carapaceW"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.95707, p-value = 0.02586
##
## [1] "acumenL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.91816, p-value = 0.0004167
##
## [1] "rostrumW"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.92248, p-value = 0.0006317
##
## [1] "dactylL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.95392, p-value = 0.01791
##
## [1] "chelaL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.95638, p-value = 0.02385
##
## [1] "palmW"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.96206, p-value = 0.04668
##
## [1] "palmL"
##
## Shapiro-Wilk normality test
##
## data: train[[i]]
## W = 0.95211, p-value = 0.01456
Plots look ok, not fabulous toward the ends but ok. (The log-transformed data look much worse.) Quite a few variables fail the normality test.
Let’s see what happens if I remove the points identified above as outliers.
## < table of extent 0 >
## [1] "carapaceL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.94929, p-value = 0.05614
##
## [1] "areolaL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.96677, p-value = 0.2437
##
## [1] "areolaW"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.9856, p-value = 0.8584
##
## [1] "carapaceW"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.93813, p-value = 0.0223
##
## [1] "acumenL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.966, p-value = 0.2287
##
## [1] "rostrumW"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.97021, p-value = 0.3214
##
## [1] "dactylL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.96754, p-value = 0.2595
##
## [1] "chelaL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.97068, p-value = 0.3336
##
## [1] "palmW"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.97671, p-value = 0.5227
##
## [1] "palmL"
##
## Shapiro-Wilk normality test
##
## data: test[[i]]
## W = 0.9666, p-value = 0.2404
That substantially improves the fit. Let’s also see what happens if I test only sanborns and only rustys.
## [1] "carapaceL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.90968, p-value = 0.004229
##
## [1] "areolaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.94163, p-value = 0.04308
##
## [1] "areolaW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.98611, p-value = 0.9033
##
## [1] "carapaceW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.89115, p-value = 0.001242
##
## [1] "acumenL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.97807, p-value = 0.6335
##
## [1] "rostrumW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.92174, p-value = 0.00985
##
## [1] "dactylL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.96349, p-value = 0.2328
##
## [1] "chelaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.96401, p-value = 0.2421
##
## [1] "palmW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.95916, p-value = 0.1672
##
## [1] "palmL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.97118, p-value = 0.408
## [1] "carapaceL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.93621, p-value = 0.121
##
## [1] "areolaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.95069, p-value = 0.2599
##
## [1] "areolaW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.97406, p-value = 0.7483
##
## [1] "carapaceW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.9349, p-value = 0.1128
##
## [1] "acumenL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.84614, p-value = 0.001489
##
## [1] "rostrumW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.80907, p-value = 0.0003233
##
## [1] "dactylL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.91049, p-value = 0.03127
##
## [1] "chelaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.91809, p-value = 0.04637
##
## [1] "palmW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.94124, p-value = 0.1581
##
## [1] "palmL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.92172, p-value = 0.05609
That’s much worse for both groups!
What about the complete dataset, including the invaded rivers? Nearly everything is non-normal:
## [1] "carapaceL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.98525, p-value = 0.02293
##
## [1] "areolaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.98055, p-value = 0.004196
##
## [1] "areolaW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.97218, p-value = 0.0002678
##
## [1] "carapaceW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.98285, p-value = 0.009512
##
## [1] "acumenL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.9718, p-value = 0.000257
##
## [1] "rostrumW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.96617, p-value = 4.528e-05
##
## [1] "dactylL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.9522, p-value = 1.421e-06
##
## [1] "chelaL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.95508, p-value = 2.831e-06
##
## [1] "palmW"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.96052, p-value = 1.115e-05
##
## [1] "palmL"
##
## Shapiro-Wilk normality test
##
## data: x[[i]]
## W = 0.9274, p-value = 8.15e-09
Maybe this relates to the fact that hybridization is occurring?? On we go…
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.76085, p-value = 7.508e-09
So that’s a tiny p-value which I guess means the data are not in multivariate normality.
Graphical assessment of multivariate normality
x <- as.matrix(train[,5:14]) # n x p numeric matrix
center <- colMeans(x) # centroid
n <- nrow(x)
p <- ncol(x)
cov <- cov(x)
d <- mahalanobis(x, center, cov) # distances
qqplot(qchisq(ppoints(n), df=p), d, main="QQ Plot Assessing Multivariate Normality", ylab="Mahalanobis D2")
abline(a=0,b=1)
train[d>20,]
## group id pop ind carapaceL areolaL areolaW carapaceW acumenL
## 13 rusty BOK:13 BOK 13 40.11 14.67 1.61 20.58 3.22
## 195 rusty STW:12 STW 12 52.05 17.82 2.62 26.53 5.24
## rostrumW dactylL chelaL palmW palmL
## 13 3.33 22.20 42.52 17.86 16.70
## 195 5.51 36.06 57.97 20.57 21.45
train[train$carapaceL>40,]
## group id pop ind carapaceL areolaL areolaW carapaceW acumenL
## 13 rusty BOK:13 BOK 13 40.11 14.67 1.61 20.58 3.22
## 14 rusty BOK:14 BOK 14 43.23 15.56 2.32 21.78 3.81
## 186 rusty STW:2 STW 2 48.54 17.46 2.40 27.28 3.58
## 191 rusty STW:7 STW 7 46.15 16.53 2.42 25.29 3.45
## 195 rusty STW:12 STW 12 52.05 17.82 2.62 26.53 5.24
## 197 rusty STW:14 STW 14 45.42 16.59 2.31 24.37 3.99
## rostrumW dactylL chelaL palmW palmL
## 13 3.33 22.20 42.52 17.86 16.70
## 14 3.96 27.98 44.07 18.55 17.66
## 186 4.47 24.05 40.04 17.11 14.47
## 191 3.97 28.19 43.94 17.28 16.36
## 195 5.51 36.06 57.97 20.57 21.45
## 197 3.94 30.14 48.09 20.30 18.12
Ah, from this plot it seems clear there are a couple of outliers up there. Two of my largest rusty males. So perhaps what I take away is that as crayfish get particularly large their shape is changing in a different way than before that point?
Homogeneity of Variances
Testing for whether variance differs between the groups. Bartlett test: looks good except rostrum width and acumen length are on the cusp of significance.
## [1] "carapaceL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 0.022912, df = 1, p-value = 0.8797
##
## [1] "areolaL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 0.0045857, df = 1, p-value = 0.946
##
## [1] "areolaW"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 0.048879, df = 1, p-value = 0.825
##
## [1] "carapaceW"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 0.17269, df = 1, p-value = 0.6777
##
## [1] "acumenL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 3.4251, df = 1, p-value = 0.06421
##
## [1] "rostrumW"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 3.9795, df = 1, p-value = 0.04606
##
## [1] "dactylL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 2.4174, df = 1, p-value = 0.12
##
## [1] "chelaL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 2.5287, df = 1, p-value = 0.1118
##
## [1] "palmW"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 1.4221, df = 1, p-value = 0.2331
##
## [1] "palmL"
##
## Bartlett test of homogeneity of variances
##
## data: train[[i]] by group
## Bartlett's K-squared = 2.2438, df = 1, p-value = 0.1341
Now, let’s figure out how to run the analysis to predict the identity of each individual from an invaded river.
## Call:
## lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL +
## rostrumW + dactylL + chelaL + palmW + palmL, data = train,
## na.action = "na.omit")
##
## Prior probabilities of groups:
## rusty sanborn
## 0.390625 0.609375
##
## Group means:
## carapaceL carapaceW areolaL areolaW acumenL rostrumW dactylL
## rusty 38.52160 20.08520 13.61280 2.071600 3.224800 3.507200 20.33400
## sanborn 29.69436 14.81538 10.12026 1.894872 2.718974 2.739231 12.82667
## chelaL palmW palmL
## rusty 33.29040 13.627600 12.552800
## sanborn 22.30385 9.047692 9.460769
##
## Coefficients of linear discriminants:
## LD1
## carapaceL 0.03899889
## carapaceW 0.07964590
## areolaL -0.32913984
## areolaW 1.33778228
## acumenL -0.29734309
## rostrumW -1.85492327
## dactylL -0.40488119
## chelaL 0.32838823
## palmW -0.86592076
## palmL 0.76377563
## [1] sanborn rusty sanborn rusty rusty sanborn rusty sanborn
## [9] sanborn rusty sanborn sanborn sanborn sanborn sanborn rusty
## [17] rusty rusty rusty sanborn sanborn sanborn sanborn sanborn
## [25] sanborn sanborn rusty rusty rusty sanborn rusty sanborn
## [33] rusty sanborn rusty sanborn rusty sanborn rusty sanborn
## [41] sanborn sanborn rusty rusty sanborn sanborn sanborn rusty
## [49] rusty sanborn sanborn sanborn sanborn sanborn sanborn sanborn
## [57] sanborn rusty sanborn rusty rusty sanborn rusty rusty
## [65] rusty rusty rusty rusty sanborn rusty rusty rusty
## [73] rusty rusty rusty rusty rusty sanborn sanborn rusty
## [81] sanborn
## Levels: rusty sanborn
## LD1
## 72 0.25191116
## 73 -1.86866197
## 74 -0.32269238
## 75 -7.85454880
## 76 -0.80736868
## 77 -0.31189894
## 78 -1.53714209
## 79 0.07157432
## 80 1.29765100
## 81 -1.31729232
## 82 0.45497605
## 83 0.29248385
## 84 -0.29550998
## 85 1.00545158
## 86 1.07154355
## 87 -2.68388442
## 88 -1.96107101
## 89 -1.15320664
## 90 -0.67004210
## 91 0.75609790
## 92 1.34823904
## 96 2.71725162
## 97 0.59898378
## 98 2.30994945
## 99 0.45363243
## 100 1.64559732
## 101 -0.82987307
## 102 -1.54497455
## 103 -4.14345853
## 104 0.02328777
## 105 -1.06884765
## 106 0.49790637
## 107 -2.94084026
## 108 0.28398231
## 109 -1.37017826
## 110 1.25686092
## 111 -1.02845964
## 112 0.35307845
## 113 -1.26535746
## 114 0.49240070
## 115 1.79650192
## 116 4.14463928
## 117 -1.20193484
## 118 -0.86766313
## 119 1.09494153
## 120 1.00895055
## 121 0.43274281
## 122 -0.93233181
## 123 -1.74706118
## 124 0.42919366
## 125 0.09752004
## 126 1.30542802
## 127 3.76483330
## 130 0.01196597
## 131 -0.11203819
## 132 0.38920496
## 133 -0.39783753
## 134 -0.71292728
## 135 0.15081096
## 136 -1.20270311
## 137 -0.94984142
## 138 -0.44598369
## 139 -1.45065050
## 140 -1.23369701
## 141 -3.70628902
## 142 -0.48535588
## 143 -1.01285340
## 144 -1.59989478
## 145 0.47625218
## 146 -1.04502688
## 147 -1.46302399
## 148 -2.54392939
## 149 -0.55708820
## 150 -1.96445806
## 151 -2.91248582
## 152 -2.62697553
## 153 -0.74551915
## 154 0.85032271
## 155 0.49188852
## 156 -1.32222096
## 157 -0.32991044
## [1] "group" "id" "pop" "ind" "carapaceL"
## [6] "areolaL" "areolaW" "carapaceW" "acumenL" "rostrumW"
## [11] "dactylL" "chelaL" "palmW" "palmL" "prob.rus"
## [16] "prob.san" "LD1" "ldaclass"
## [1] 1 2 3 4 5 6
## pop
## ldaclass KDN KGL KOA KOB KOC KOK
## rusty 0.36 0.36 0.57 0.29 0.57 0.71
## sanborn 0.64 0.64 0.43 0.71 0.43 0.29
## [1] sanborn sanborn sanborn rusty rusty rusty sanborn sanborn
## [9] sanborn sanborn sanborn sanborn rusty rusty rusty rusty
## [17] rusty rusty sanborn rusty rusty sanborn rusty rusty
## [25] rusty rusty sanborn sanborn sanborn sanborn rusty sanborn
## [33] rusty rusty sanborn rusty sanborn sanborn rusty sanborn
## [41] sanborn rusty rusty rusty sanborn sanborn rusty sanborn
## [49] rusty rusty sanborn rusty rusty sanborn sanborn rusty
## [57] sanborn rusty rusty rusty sanborn rusty sanborn sanborn
## [65] sanborn sanborn rusty rusty
## Levels: rusty sanborn
## pop
## ldaclass HHT HRM HUX LLB TMR
## rusty 0.36 0.77 0.36 0.62 0.50
## sanborn 0.64 0.23 0.64 0.38 0.50
fit.train.cv <- lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL, data=train, na.action="na.omit", CV=T)
pred.allo <- predict(fit.train, newdata=train[,5:14])
ldahist(pred.allo$x, pred.allo$class)
train$prob.rus <- fit.train.cv$posterior[,"rusty"]
train$prob.san <- fit.train.cv$posterior[,"sanborn"]
train$LD1 <- pred.allo$x
train$ldaclass <- pred.allo$class
with(train, round(prop.table(table(ldaclass, pop), margin=2), digits=2))
## pop
## ldaclass BOK STW CRM SCL VSG
## rusty 0.86 1.00 0.00 0.08 0.15
## sanborn 0.14 0.00 1.00 0.92 0.85
with(train, plot(ldaclass ~ pop))
train$porder <- adply(.data=as.character(train$pop), .margins=1, .fun=switch, KOB=1, KOA=2, KOC=3, KOK=4, KDN=5, KGL=6, CRM=1, SCL=2, VSG=3, HUX=1, TMR=2, HHT=3, HRM=4, LLB=5, BOK=1, STW=12, NA)[,2]
train$pop <- reorder(train$pop, train$porder)
with(train, plot(carapaceL ~ LD1, col=c("red","blue")[train$group]))
with(train, plot(carapaceW ~ LD1, col=c("red","blue")[train$group]))
with(train, plot(areolaL ~ LD1, col=c("red","blue")[train$group]))
with(train, plot(areolaW ~ LD1, col=c("red","blue")[train$group]))
with(train, plot(chelaL ~ LD1, col=c("red","blue")[train$group]))
with(train, plot(palmW ~ LD1, col=c("red","blue")[train$group]))
xyplot(carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL ~ LD1, data=train, group=group, auto.key=T, scales="free")
xyplot(areolaL + areolaL/carapaceL ~ LD1, data=train, group=group, auto.key=T, scales="free")
meangroup <- fit.train$mean
meanproj <- meangroup%*%fit.train$scaling
dist(meanproj)
## rusty
## sanborn 2.76001
First need to create the dataset with posterior probabilities for each group. Then switch to long format to make the barchart.
# with(huron, table(pop, Horder)) # HUX, TMR, HHT, HRM, LLB
# with(kokosing, table(pop, Korder)) # KOB, KOA, KOC, KOK, KDN, KGL
full <- rbind(train, kok, hur)
full <- arrange(full, group, pop, prob.rus)
long <- gather(full, key="assign", value="prob", 15:16)
names(long)
## [1] "group" "id" "pop" "ind" "carapaceL"
## [6] "areolaL" "areolaW" "carapaceW" "acumenL" "rostrumW"
## [11] "dactylL" "chelaL" "palmW" "palmL" "LD1"
## [16] "ldaclass" "porder" "assign" "prob"
long <- arrange(long, group, pop, assign)
long$group <- factor(toupper(long$group))
# change labels to give river names for clarity
long$popname <- factor(adply(.data=as.character(long$pop), .margins=1, .fun=switch,
CRM="Mohican R.", SCL="Salt Cr.", VSG="Vermilion R.", KOB="K1", KOA="K2", KOC="K3", KOK="K4", KDN="K5", KGL="K6", BOK="Bokes Cr.", STW="Stillwater Cr.", HUX="H1", TMR="H2", HHT="H3", HRM="H4", LLB="H5", NA)[,2])
Now generate the barcharts showing posterior probabilities for individuals within each group and population.
Allopatric sites:
Sympatric sites:
names(native)
## [1] "group" "id" "pop"
## [4] "ind" "key.species" "field.id"
## [7] "sex" "form" "carapaceL"
## [10] "areolaL" "areolaW" "carapaceW"
## [13] "acumenL" "rostrumW" "dactylL"
## [16] "chelaL" "palmW" "palmL"
## [19] "carpus.spines" "gonopodL" "mesial.projL"
## [22] "central.projL" "annulus.ventralis" "notes"
## [25] "carapaceWL" "areolaLcaraL" "areolaWL"
## [28] "rostrumWcaraL" "acumenLcaraL" "rostrumWacumenL"
## [31] "dactylLchL" "dactylLpalmL" "palmWchL"
## [34] "palmLchL" "palmWL" "chelaLcaraL"
## [37] "centralLgonoL" "centLmesLgonoL"
with(native, table(sex, form, exclude=F))
## form
## sex 1 2 <NA>
## f 0 0 34
## m 17 16 0
gono.allo <- na.omit(native[,c(1:4,9:18,20:22)])
gono.allo$porder <- adply(.data=as.character(gono.allo$pop), .margins=1, .fun=switch, KOB=1, KOA=2, KOC=3, KOK=4, KDN=5, KGL=6, CRM=1, SCL=2, VSG=3, HUX=1, TMR=2, HHT=3, HRM=4, LLB=5, BOK=1, STW=12, NA)[,2]
gono.allo$pop <- reorder(gono.allo$pop, gono.allo$porder)
fit.gono <- lda(group ~ carapaceL + gonopodL + mesial.projL + central.projL, data=gono.allo, na.action="na.omit")
fit.gono
## Call:
## lda(group ~ carapaceL + gonopodL + mesial.projL + central.projL,
## data = gono.allo, na.action = "na.omit")
##
## Prior probabilities of groups:
## rusty sanborn
## 0.375 0.625
##
## Group means:
## carapaceL gonopodL mesial.projL central.projL
## rusty 39.18167 15.44167 3.874167 4.790
## sanborn 29.59300 9.47900 1.301000 1.372
##
## Coefficients of linear discriminants:
## LD1
## carapaceL 0.5776023
## gonopodL -1.8503101
## mesial.projL 2.3759025
## central.projL -2.2810980
fit.gono.cv <- lda(group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL + rostrumW + dactylL + chelaL + palmW + palmL + gonopodL + mesial.projL + central.projL, data=gono.allo, na.action="na.omit", CV=T)
ct <- table(gono.allo$group, fit.gono.cv$class)
diag(prop.table(ct,1))
## rusty sanborn
## 1 1
sum(diag(prop.table(ct)))
## [1] 1
fit.gono.cv
## $class
## [1] rusty rusty rusty rusty rusty rusty rusty rusty
## [9] rusty rusty rusty rusty sanborn sanborn sanborn sanborn
## [17] sanborn sanborn sanborn sanborn sanborn sanborn sanborn sanborn
## [25] sanborn sanborn sanborn sanborn sanborn sanborn sanborn sanborn
## Levels: rusty sanborn
##
## $posterior
## rusty sanborn
## 8 1.000000e+00 9.788181e-17
## 9 9.857756e-01 1.422437e-02
## 10 1.000000e+00 6.284654e-13
## 11 1.000000e+00 4.306775e-28
## 12 1.000000e+00 6.726268e-16
## 13 1.000000e+00 4.175299e-13
## 14 1.000000e+00 1.952484e-17
## 193 1.000000e+00 1.583360e-18
## 194 9.999999e-01 8.481714e-08
## 195 5.504502e-01 4.495498e-01
## 196 1.000000e+00 9.895939e-26
## 197 1.000000e+00 2.281289e-23
## 22 4.682414e-16 1.000000e+00
## 23 1.139974e-10 1.000000e+00
## 24 1.474120e-11 1.000000e+00
## 25 5.660141e-16 1.000000e+00
## 26 2.238317e-20 1.000000e+00
## 27 1.010931e-14 1.000000e+00
## 179 1.086300e-20 1.000000e+00
## 180 9.026752e-29 1.000000e+00
## 181 3.265654e-16 1.000000e+00
## 182 6.808643e-17 1.000000e+00
## 183 2.228447e-08 1.000000e+00
## 184 3.893949e-07 9.999996e-01
## 185 2.518724e-25 1.000000e+00
## 212 7.255777e-21 1.000000e+00
## 217 1.055236e-17 1.000000e+00
## 218 4.828139e-13 1.000000e+00
## 220 4.028757e-19 1.000000e+00
## 222 3.094562e-15 1.000000e+00
## 223 8.107641e-38 1.000000e+00
## 224 1.395243e-02 9.860476e-01
##
## $terms
## group ~ carapaceL + carapaceW + areolaL + areolaW + acumenL +
## rostrumW + dactylL + chelaL + palmW + palmL + gonopodL +
## mesial.projL + central.projL
## attr(,"variables")
## list(group, carapaceL, carapaceW, areolaL, areolaW, acumenL,
## rostrumW, dactylL, chelaL, palmW, palmL, gonopodL, mesial.projL,
## central.projL)
## attr(,"factors")
## carapaceL carapaceW areolaL areolaW acumenL rostrumW dactylL
## group 0 0 0 0 0 0 0
## carapaceL 1 0 0 0 0 0 0
## carapaceW 0 1 0 0 0 0 0
## areolaL 0 0 1 0 0 0 0
## areolaW 0 0 0 1 0 0 0
## acumenL 0 0 0 0 1 0 0
## rostrumW 0 0 0 0 0 1 0
## dactylL 0 0 0 0 0 0 1
## chelaL 0 0 0 0 0 0 0
## palmW 0 0 0 0 0 0 0
## palmL 0 0 0 0 0 0 0
## gonopodL 0 0 0 0 0 0 0
## mesial.projL 0 0 0 0 0 0 0
## central.projL 0 0 0 0 0 0 0
## chelaL palmW palmL gonopodL mesial.projL central.projL
## group 0 0 0 0 0 0
## carapaceL 0 0 0 0 0 0
## carapaceW 0 0 0 0 0 0
## areolaL 0 0 0 0 0 0
## areolaW 0 0 0 0 0 0
## acumenL 0 0 0 0 0 0
## rostrumW 0 0 0 0 0 0
## dactylL 0 0 0 0 0 0
## chelaL 1 0 0 0 0 0
## palmW 0 1 0 0 0 0
## palmL 0 0 1 0 0 0
## gonopodL 0 0 0 1 0 0
## mesial.projL 0 0 0 0 1 0
## central.projL 0 0 0 0 0 1
## attr(,"term.labels")
## [1] "carapaceL" "carapaceW" "areolaL" "areolaW"
## [5] "acumenL" "rostrumW" "dactylL" "chelaL"
## [9] "palmW" "palmL" "gonopodL" "mesial.projL"
## [13] "central.projL"
## attr(,"order")
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(group, carapaceL, carapaceW, areolaL, areolaW, acumenL,
## rostrumW, dactylL, chelaL, palmW, palmL, gonopodL, mesial.projL,
## central.projL)
## attr(,"dataClasses")
## group carapaceL carapaceW areolaL areolaW
## "factor" "numeric" "numeric" "numeric" "numeric"
## acumenL rostrumW dactylL chelaL palmW
## "numeric" "numeric" "numeric" "numeric" "numeric"
## palmL gonopodL mesial.projL central.projL
## "numeric" "numeric" "numeric" "numeric"
##
## $call
## lda(formula = group ~ carapaceL + carapaceW + areolaL + areolaW +
## acumenL + rostrumW + dactylL + chelaL + palmW + palmL + gonopodL +
## mesial.projL + central.projL, data = gono.allo, CV = T, na.action = "na.omit")
##
## $xlevels
## named list()
plot(fit.gono)
So using the gonopod measurements provides perfect discrimination. Now have to try this out on the sympatric data set.
names(gono.allo)
## [1] "group" "id" "pop" "ind"
## [5] "carapaceL" "areolaL" "areolaW" "carapaceW"
## [9] "acumenL" "rostrumW" "dactylL" "chelaL"
## [13] "palmW" "palmL" "gonopodL" "mesial.projL"
## [17] "central.projL" "porder"
pred.gono.allo <- predict(fit.gono, newdata=gono.allo)
ldahist(pred.gono.allo$x, pred.gono.allo$class)
gono.allo$prob.rus <- fit.gono.cv$posterior[,"rusty"]
gono.allo$prob.san <- fit.gono.cv$posterior[,"sanborn"]
gono.allo$LD1 <- pred.gono.allo$x
gono.allo$ldaclass <- pred.gono.allo$class
with(gono.allo, round(prop.table(table(ldaclass, pop), margin=2), digits=2))
## pop
## ldaclass BOK CRM SCL VSG STW
## rusty 1 0 0 0 1
## sanborn 0 1 1 1 0
with(gono.allo, plot(ldaclass ~ pop))
Huron gonopods
hur.gono <- na.omit(huron[,c(1:4,9:18,20:22)])
names(hur.gono)
## [1] "group" "id" "pop" "ind"
## [5] "carapaceL" "areolaL" "areolaW" "carapaceW"
## [9] "acumenL" "rostrumW" "dactylL" "chelaL"
## [13] "palmW" "palmL" "gonopodL" "mesial.projL"
## [17] "central.projL"
hur.gono$porder <- adply(.data=as.character(hur.gono$pop), .margins=1, .fun=switch, KOB=1, KOA=2, KOC=3, KOK=4, KDN=5, KGL=6, CRM=1, SCL=2, VSG=3, HUX=1, TMR=2, HHT=3, HRM=4, LLB=5, BOK=1, STW=12, NA)[,2]
hur.gono$pop <- reorder(hur.gono$pop, hur.gono$porder)
pred.hur.gono <- predict(fit.gono, newdata=hur.gono)
pred.hur.gono$class
## [1] sanborn sanborn sanborn sanborn sanborn sanborn sanborn rusty
## [9] sanborn rusty rusty rusty rusty sanborn sanborn sanborn
## [17] sanborn sanborn sanborn sanborn rusty rusty rusty rusty
## [25] rusty sanborn sanborn sanborn sanborn sanborn sanborn sanborn
## [33] sanborn
## Levels: rusty sanborn
with(pred.hur.gono, ldahist(x, class))
hur.gono$prob.rus <- pred.hur.gono$posterior[,"rusty"]
hur.gono$prob.san <- pred.hur.gono$posterior[,"sanborn"]
hur.gono$LD1 <- pred.hur.gono$x
hur.gono$ldaclass <- pred.hur.gono$class
with(hur.gono, round(prop.table(table(ldaclass, pop), margin=2), digits=2))
## pop
## ldaclass HUX TMR HHT HRM LLB
## rusty 0.00 0.00 0.00 0.83 0.83
## sanborn 1.00 1.00 1.00 0.17 0.17
with(hur.gono, plot(ldaclass ~ pop, ylab="Posterior probability of rusty", xlab="River position, upstream to downstream", main="Huron R. predicted species identity by site"))
Kokosing gonopods
kok.gono <- na.omit(kokosing[,c(1:4,9:18,20:22)])
names(kok.gono)
## [1] "group" "id" "pop" "ind"
## [5] "carapaceL" "areolaL" "areolaW" "carapaceW"
## [9] "acumenL" "rostrumW" "dactylL" "chelaL"
## [13] "palmW" "palmL" "gonopodL" "mesial.projL"
## [17] "central.projL"
kok.gono$porder <- adply(.data=as.character(hur.gono$pop), .margins=1, .fun=switch, KOB=1, KOA=2, KOC=3, KOK=4, KDN=5, KGL=6, CRM=1, SCL=2, VSG=3, HUX=1, TMR=2, HHT=3, HRM=4, LLB=5, BOK=1, STW=12, NA)[,2]
hur.gono$pop <- reorder(hur.gono$pop, hur.gono$porder)
pred.kok.gono <- predict(fit.gono, newdata=kok.gono)
pred.kok.gono$class
## [1] sanborn sanborn rusty rusty rusty rusty rusty sanborn
## [9] sanborn sanborn sanborn sanborn sanborn rusty sanborn rusty
## [17] sanborn rusty sanborn sanborn sanborn rusty rusty rusty
## [25] rusty rusty rusty rusty rusty rusty rusty rusty
## [33] rusty
## Levels: rusty sanborn
with(pred.kok.gono, ldahist(x, class))
kok.gono$prob.rus <- pred.kok.gono$posterior[,"rusty"]
kok.gono$prob.san <- pred.kok.gono$posterior[,"sanborn"]
kok.gono$LD1 <- pred.kok.gono$x
kok.gono$ldaclass <- pred.kok.gono$class
with(kok.gono, round(prop.table(table(ldaclass, pop), margin=2), digits=2))
## pop
## ldaclass KDN KGL KOA KOB KOC KOK
## rusty 0.71 0.00 0.43 0.00 0.71 1.00
## sanborn 0.29 1.00 0.57 1.00 0.29 0.00
with(kok.gono, plot(ldaclass ~ pop, ylab="Posterior probability of rusty", xlab="River position, upstream to downstream", main="Kokosing R. predicted species identity by site"))
Barcharts
fullgono <- rbind(gono.allo, kok.gono, hur.gono)
fullgono <- arrange(fullgono, group, pop, prob.rus)
names(fullgono)
## [1] "group" "id" "pop" "ind"
## [5] "carapaceL" "areolaL" "areolaW" "carapaceW"
## [9] "acumenL" "rostrumW" "dactylL" "chelaL"
## [13] "palmW" "palmL" "gonopodL" "mesial.projL"
## [17] "central.projL" "porder" "prob.rus" "prob.san"
## [21] "LD1" "ldaclass"
# if all traits plus gono
longgono <- gather(fullgono, key="assign", value="prob", 19:20)
# if gono only
# longgono <- gather(fullgono, key="assign", value="prob", 18:19)
names(longgono)
## [1] "group" "id" "pop" "ind"
## [5] "carapaceL" "areolaL" "areolaW" "carapaceW"
## [9] "acumenL" "rostrumW" "dactylL" "chelaL"
## [13] "palmW" "palmL" "gonopodL" "mesial.projL"
## [17] "central.projL" "porder" "LD1" "ldaclass"
## [21] "assign" "prob"
longgono <- arrange(longgono, group, pop, assign)
longgono$group <- factor(toupper(longgono$group))
# change labels to give river names for clarity
longgono$popname <- factor(adply(.data=as.character(longgono$pop), .margins=1, .fun=switch,
CRM="Mohican R.", SCL="Salt Cr.", VSG="Vermilion R.", KOB="K1", KOA="K2", KOC="K3", KOK="K4", KDN="K5", KGL="K6", BOK="Bokes Cr.", STW="Stillwater Cr.", HUX="H1", TMR="H2", HHT="H3", HRM="H4", LLB="H5", NA)[,2])
clr <- c("brown4", "lightblue")
a <- list(columns=2, text=c("O. rusticus", "O. sanbornii"), rectangles=F, rect=list(col=clr), cex=0.75)
b=strip.custom(par.strip.text=list(cex=0.75), bg="oldlace")
# allopatric
pdf(file="barchart_allopatry_gono.pdf", width=6, height=4)
barchart(prob ~ id | group:popname, groups=assign, data=longgono, stack=T, scales=list(relation="free", draw=F), subset=(group%in%c("SANBORN", "RUSTY")), layout=c(5,1), ylab=list(label="Posterior assignment probability", cex=0.9), main=list(label="Assignment probabilities in allopatry", cex=1), col=clr, auto.key=a, strip=F, strip.left=b)
dev.off()
## quartz_off_screen
## 2
# sympatric
b=strip.custom(par.strip.text=list(cex=0.75), bg="oldlace")
pdf(file="barchart_sympatry_gono.pdf", width=6, height=4)
barchart(prob ~ id | group:popname, groups=assign, data=longgono, stack=T, scales=list(relation="free", draw=F), subset=(group%in%c("KOKOSING","HURON")), layout=c(6,2), ylab=list(label="Posterior assignment probability", cex=0.9), main=list(label="Assignment probabilities in sympatry", cex=1), col=clr, index.cond=list(c(6:11,1:5)),auto.key=a, sub="upstream sites ----> downstream sites", strip.left=b, strip=F)
dev.off()
## quartz_off_screen
## 2